Title analysis: titles of male and female speakers
Title analysis
Formatting the data for tidytext
# Keep only the speaker/talk columns used in the title analysis.
tit <- dplyr::select(
  data.tit,
  id, gender, position_cat, audience_n, title_english
)

# Split each English title into tokens: one row per token, stored in `word`.
text_tok <- unnest_tokens(tit, output = word, input = title_english)
Excluding stopwords, e.g. âandâ âorâ âtheâ âofâ âinâ.
Standardizing plurals.
# English stopword list (stopwords-iso source), as a one-column tibble so it
# can be joined against the token table.
stop_w <- tibble(word = stopwords(source = "stopwords-iso"))

# Drop every token that is a stopword.
text <- anti_join(text_tok, stop_w, by = "word")

# Drop numbers, the dash token and a few other leftover tokens.
remover <- c("ăŒ", "1", "1st", "2", "364", "40", "70", "750", "aff", "da")
text <- filter(text, !(word %in% remover))
# Simple plurals: these words are singularised just by dropping the final "s".
plural <- c(
  "actions", "advances", "adaptations", "amphibians", "animals", "ants",
  "anurans", "applications", "approaches", "bees", "builds", "birds",
  "cerrados", "challenges",
  "continents", "crops",
  "decisions", "declines", "determines", "determinants", "defenses",
  "dynamics",
  "economics", "ecosystems", "environments", "experiences",
  "forests",
  "genetics", "gifts", "gradients", "guides", "impacts",
  "increases", "interactions", "lives",
  "landscapes", "males", "mammals", "mangroves", "models", "movements",
  "mutualisms", "networks", "neotropics",
  "opilions", "phenotypes", "plants", "projects", "paths", "perspectives",
  "populations", "promotes", "relationships", "relations",
  "resources", "responses", "roads", "services", "skulls", "snakes", "seeds",
  "spaces", "spiders", "stages", "trees", "variations",
  "threats"
)

# Locate the listed plurals once, then cut the last character of each.
is_plural <- text$word %in% plural
plural_words <- text$word[is_plural]
text$word[is_plural] <- substr(plural_words, 1, nchar(plural_words) - 1)
- Grouping similar words:
# Lemma table: each row maps a word variant (column 1) to the canonical form
# (column 2) under which it is counted. Misspelled variants are intentional:
# they match tokens as they appear in the titles.
lemma <- rbind(
  c("adaptive", "adaptation"),
  c("advancement", "advance"),
  c("agricultural", "agriculture"),
  c("agro", "agriculture"),
  c("amazonia", "amazon"),
  c("amazonian", "amazon"),
  c("andean", "andes"),
  c("apply", "application"),
  c("applying", "application"),
  c("apidae", "apis"),
  c("arachnida", "arachnid"),
  c("argue", "argument"),
  c("basal", "basis"),
  c("behavioral", "behavior"),
  c("behavioural", "behavior"),
  c("bignonieae", "bignoniaceae"),
  c("biological", "biology"),
  c("brazilian", "brazil"),
  c("building", "build"),
  c("changing", "change"),
  c("cnidarian", "cnidaria"),
  c("coastal", "coast"),
  c("colour", "color"),
  c("colors", "color"),
  c("communities", "community"),
  c("competitive", "competition"),
  c("complexity", "complex"),
  c("convergences", "convergence"),
  c("convergent", "convergence"),
  # NOTE(review): the target was "cordata.tit", which looks like a
  # search-and-replace artifact ("data" -> "data.tit"); confirm "cordata".
  c("cordatus", "cordata"),
  c("croplands", "crop"),
  c("cultural", "culture"),
  c("darwin's", "darwin"),
  c("darwinian", "darwin"),
  c("defensive", "defense"),
  c("dependent", "dependence"),
  c("detecting", "detection"),
  c("determine", "determinant"),
  c("developmental", "development"),
  c("dispersers", "dispersal"),
  c("disturbed", "disturbance"),
  c("diversification", "diversity"),
  c("dragonflies", "dragonfly"),
  c("drier", "drought"),
  c("ecological", "ecology"),
  c("ecologists", "ecology"),
  c("endemic", "endemism"),
  c("effectiveness", "efficiency"),
  c("environmental", "environment"),
  c("evolutionary", "evolution"),
  c("expanding", "expansion"),
  c("extinct", "extinction"),
  c("facilitate", "facilitation"),
  c("fisheries", "fishery"),
  c("floral", "flora"),
  c("floristic", "flora"),
  c("forested", "forest"),
  c("functional", "function"),
  c("functionally", "function"),
  c("functioning", "function"),
  c("geographical", "geographic"),
  c("heterogeneties", "heterogeneity"),
  c("heterogeneous", "heterogeneity"),
  c("histories", "history"),
  c("integrated", "integration"),
  c("intregating", "integration"),
  c("integrative", "integration"),
  c("invasive", "invasion"),
  c("isotopic", "isotope"),
  c("linking", "link"),
  c("living", "live"),
  c("mammalia", "mammal"),
  c("managed", "manage"),
  c("managers", "manage"),
  c("mathematical", "mathematics"),
  c("mates", "mating"),
  c("mediated", "mediate"),
  c("mechanistic", "mechanism"),
  c("matrices", "matrix"),
  c("migratory", "migration"),
  c("mimicking", "mimicry"),
  c("modeling", "model"),
  c("mutualistic", "mutualism"),
  c("natural", "nature"),
  c("neotropical", "neotropic"),
  c("northeastern", "northeast"),
  c("occuring", "occur"),
  c("onça", "onca"),
  c("opiliones", "opilion"),
  c("parasite", "parasitism"),
  c("parent", "parenting"),
  c("phylogenies", "phylogeny"),
  c("phylogenetic", "phylogeny"),
  c("phylogenomic", "phylogeny"),
  c("pollinators", "pollination"),
  c("protected", "protect"),
  c("protective", "protect"),
  c("rainfall", "rain"),
  c("reconstructing", "reconstruction"),
  c("regulatory", "regulation"),
  c("regulates", "regulation"),
  c("relation", "relationship"),
  c("reproductive", "reproduction"),
  c("restored", "restoration"),
  c("robustness", "robust"),
  c("scientific", "science"),
  c("scientist", "science"),
  c("sexy", "sexual"),
  c("simulated", "simulation"),
  c("societies", "society"),
  c("social", "society"),
  c("socio", "society"),
  c("space", "spatial"),
  c("spacio", "spatial"),
  c("stabilize", "stability"),
  c("stable", "stability"),
  c("stories", "story"),
  c("strategic", "strategy"),
  c("strategies", "strategy"),
  c("structured", "structure"),
  c("structuring", "structure"),
  c("studies", "study"),
  c("studing", "study"),
  c("sustainable", "sustainability"),
  c("theories", "theory"),
  c("theoretical", "theory"),
  c("threatened", "threat"),
  c("tropical", "tropic"),
  c("vision", "visual")
)

# Replace every variant with its canonical form in a single vectorised lookup.
# The lookup runs on the character matrix (before the data-frame conversion)
# so it never touches factor columns. No canonical form is itself listed as a
# variant, so one pass gives the same result as the row-by-row loop it replaces.
variant_row <- match(text$word, lemma[, 1])
has_lemma <- !is.na(variant_row)
text$word[has_lemma] <- lemma[variant_row[has_lemma], 2]

# The table is left as a data frame, as the original script did.
lemma <- as.data.frame(lemma)
Counting words Frequency by gender
Removing stopwords, we keep 2299 words.
##
## F M
## 1062 1237
##
## F M
## others 16 10
## postdoc 179 230
## professor 163 437
## student 693 549
20 palavras mais comuns
| word | n |
|---|---|
| ecology | 50 |
| forest | 42 |
| evolution | 32 |
| landscape | 25 |
| bird | 22 |
| model | 22 |
| diversity | 21 |
| species | 21 |
| environment | 19 |
| plant | 18 |
| structure | 17 |
| atlantic | 15 |
| brazil | 15 |
| effects | 15 |
| conservation | 14 |
| interaction | 13 |
| study | 13 |
| bee | 12 |
| community | 12 |
| network | 12 |
| patterns | 12 |
| são | 12 |
word cloud
# Word clouds of the title words, one call per gender
# (F in purple "#6D57CF", M in orange "#FCA532"), on a 1 x 2 plotting layout.
par(mfrow = c(1, 2))
textplot_wordcloud(
  x = dfm(tokens(text$word[text$gender == "F"])),
  col = "#6D57CF"
)
# `TRUE` spelled out: `T` is an ordinary variable and can be reassigned.
par(new = TRUE)
textplot_wordcloud(
  x = dfm(tokens(text$word[text$gender == "M"])),
  col = "#FCA532"
)
Word frequencies by gender
# Relative frequency of each word within each gender, reshaped to one row per
# word with columns proportion_F / proportion_M / n_F / n_M. Words used by
# only one gender get NA in the other gender's columns.
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# Label only the (up to) 20 words with the largest absolute difference;
# the index is capped so a short table cannot be indexed past its end.
props$label <- NA_character_
top_diff <- seq_len(min(20, nrow(props)))
props$label[top_diff] <- props$word[top_diff]

# Female vs male word proportions on log-log axes; the dashed line is y = x.
ggplot(props, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(size = 2.5, alpha = 0.5) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(
    name = "Male most used words",
    labels = percent_format(), limits = c(0.0005, 0.03)
  ) +
  scale_y_log10(
    name = "Female Most used words",
    labels = percent_format(), limits = c(0.0005, 0.03)
  ) +
  scale_color_gradient(
    name = "Abs Diff", low = "blue", high = "red",
    labels = percent_format()
  ) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))
Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females. Top 20 words by absolute differences are also indicated in text.
Correlation of word-use frequency between genders:
##
## Pearson's product-moment correlation
##
## data: props$proportion_F and props$proportion_M
## t = 14.789, df = 225, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6294218 0.7625874
## sample estimates:
## cor
## 0.7020916
Highly correlated: female and male speakers tend to use the main words at similar frequencies.
# Mirrored bar chart of the labelled (top-difference) words: female
# proportions are negated so they extend to the left of zero.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(
    ntot = n_F + n_M,
    word = fct_reorder(word, ntot, max),
    proportion_F = -proportion_F
  ) %>%
  # Columns are selected by name rather than by position (2:3).
  pivot_longer(
    c(proportion_F, proportion_M),
    names_to = "gender", values_to = "proportion"
  )

# Tick positions; the labels show absolute values on both sides of zero.
prop_breaks <- c(-0.02, -0.01, 0, 0.01, 0.02)

ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(
    name = "gender", values = c("#6D57CF", "#FCA532"),
    labels = c("F", "M")
  ) +
  geom_vline(xintercept = prop_breaks, linetype = "dotted", col = "darkgray") +
  scale_x_continuous(breaks = prop_breaks, labels = abs(prop_breaks))
PROFESSOR Word frequencies by gender
# Word clouds restricted to professors, one call per gender
# (F in purple, M in orange), on a 1 x 2 plotting layout.
textprof <- text[text$position_cat == "professor", ]
par(mfrow = c(1, 2))
textplot_wordcloud(
  x = dfm(tokens(textprof$word[textprof$gender == "F"])),
  col = "#6D57CF"
)
# `TRUE` spelled out: `T` is an ordinary variable and can be reassigned.
par(new = TRUE)
textplot_wordcloud(
  x = dfm(tokens(textprof$word[textprof$gender == "M"])),
  col = "#FCA532"
)

# Relative frequency of each word within each gender (professors only),
# reshaped to one row per word with proportion_F / proportion_M / n_F / n_M.
propsP <- text %>%
  filter(position_cat == "professor") %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# Label only the (up to) 20 words with the largest absolute difference;
# the index is capped so a short table cannot be indexed past its end.
propsP$label <- NA_character_
top_diff_prof <- seq_len(min(20, nrow(propsP)))
propsP$label[top_diff_prof] <- propsP$word[top_diff_prof]

# Female vs male word proportions on log-log axes; the dashed line is y = x.
ggplot(propsP, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(size = 2.5, alpha = 0.3) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words", labels = percent_format()) +
  scale_y_log10(name = "Female Most used words", labels = percent_format()) +
  scale_color_gradient(
    name = "Abs Diff", low = "blue", high = "red",
    labels = percent_format()
  ) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))
Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females. Top 20 words by absolute differences are also indicated in text.
Correlation of word-use frequency between genders:
##
## Pearson's product-moment correlation
##
## data: propsP$proportion_F and propsP$proportion_M
## t = 0.98729, df = 40, p-value = 0.3294
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1570569 0.4376523
## sample estimates:
## cor
## 0.1542369
No significant correlation among professors (r = 0.15, p = 0.33).
20 palavras mais usadas
# Words behind the 20 largest per-gender proportions among professors.
# The word itself is pulled here: `label` is only filled for the
# top-difference words, so pulling it returned mostly NA instead of the
# most used words.
pala <- propsP %>%
  pivot_longer(
    c(proportion_F, proportion_M),
    names_to = "gender", values_to = "prop"
  ) %>%
  arrange(desc(prop)) %>%
  slice_head(n = 20) %>%
  pull(word) %>%
  unique()

# Mirrored bar chart of those words: female proportions are negated so they
# extend to the left of zero.
propsP3 <- propsP %>%
  filter(word %in% pala) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(
    ntot = n_F + n_M,
    word = fct_reorder(word, ntot, max),
    proportion_F = -proportion_F
  ) %>%
  pivot_longer(
    c(proportion_F, proportion_M),
    names_to = "gender", values_to = "proportion"
  )

# Tick positions; labels are the absolute values (the original labelled the
# -0.01 tick as "-0.01" while every other tick showed its magnitude).
used_breaks <- c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03)

ggplot(propsP3, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(
    name = "gender", values = c("#6D57CF", "#FCA532"),
    labels = c("F", "M")
  ) +
  geom_vline(xintercept = used_breaks, linetype = "dotted", col = "darkgray") +
  scale_x_continuous(breaks = used_breaks, labels = abs(used_breaks))
20 words with the largest differences in frequency
# Mirrored bar chart of the labelled (top-difference) words among professors:
# female proportions are negated so they extend to the left of zero.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(
    ntot = n_F + n_M,
    word = fct_reorder(word, ntot, max),
    proportion_F = -proportion_F
  ) %>%
  pivot_longer(
    c(proportion_F, proportion_M),
    names_to = "gender", values_to = "proportion"
  )

# Tick positions; labels are the absolute values (the original labelled the
# -0.01 tick as "-0.01" while every other tick showed its magnitude).
diff_breaks <- c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03)

ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(
    name = "gender", values = c("#6D57CF", "#FCA532"),
    labels = c("F", "M")
  ) +
  geom_vline(xintercept = diff_breaks, linetype = "dotted", col = "darkgray") +
  scale_x_continuous(breaks = diff_breaks, labels = abs(diff_breaks))
TF IDF
OBS: essas análises não ajudaram muito, talvez nem precisem mais ficar aqui
# tf-idf of every word, treating each gender as one "document".
text_id <- text %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
# text_id
# NOTE(review): in the source this statement was fused onto the `#text_id`
# comment line (chunk boundary lost); restored as code - confirm.
text_id$word <- as.factor(text_id$word)

# Highest tf-idf words per gender (top 5, ties kept), one panel per gender.
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is on the x axis and the words on y; the titles were swapped.
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
TF IDF professors
OBS: essas análises não ajudaram muito, talvez nem precisem mais ficar aqui
# tf-idf of every word among professors, treating each gender as one
# "document".
text_idP <- text %>%
  filter(position_cat == "professor") %>%
  count(gender, word) %>%
  bind_tf_idf(word, gender, n) %>%
  arrange(desc(tf_idf))
# text_id
# NOTE(review): in the source this statement was fused onto the `#text_id`
# comment line (chunk boundary lost); restored as code - confirm.
text_idP$word <- as.factor(text_idP$word)

# Highest tf-idf words per gender (top 5, ties kept), one panel per gender.
text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(5, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is on the x axis and the words on y; the titles were swapped.
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
Topic model
# Document-term matrix: one document per title. The document id is
# "<id>_<gender>", so the gender can later be read back from the id's last
# character.
matext <- text %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

# LDA fits with 2, 3 and 4 topics (same seed), compared by AIC.
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
# `TRUE` spelled out: `T` is an ordinary variable and can be reassigned.
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, base = TRUE)
## AIC dAIC df
## ap_lda2 33766.0 0.0 2099
## ap_lda3 33853.1 87.1 3148
## ap_lda4 35215.2 1449.2 4197
word-topic probabilities
# Per-topic word probabilities (beta) from the 2-topic model.
ap_topics <- tidy(ap_lda2, matrix = "beta")

# Ten highest-beta terms of each topic (ties kept), sorted within topic.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Horizontal bars of the top terms, one panel per topic.
top_terms_ordered <- mutate(ap_top_terms, term = reorder(term, beta))
ggplot(top_terms_ordered, aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Document-topic probabilities
# Per-document topic probabilities (gamma) from the 2-topic model.
ap_documents <- tidy(ap_lda2, matrix = "gamma")

# Assign each document to its highest-gamma topic. The gender is the last
# character of the document id ("<id>_<gender>").
classifi <- ap_documents %>%
  mutate(
    gender = substr(document, nchar(document), nchar(document))
  ) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

# Gender x assigned-topic contingency table.
table(classifi$gender, classifi$topic)
##
## 1 2
## F 71 62
## M 83 94
## gender 1 2
## F 53% 47%
## M 47% 53%
# Distribution of the winning gamma per assigned topic, one panel per gender.
ggplot(classifi, aes(as.character(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ gender)
Topic model Professors only
# Document-term matrix restricted to professors: one document per title,
# with id "<id>_<gender>". This overwrites the all-speaker `matext` and
# `ap_lda*` objects.
matext <- text %>%
  filter(position_cat == "professor") %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

# LDA fits with 2, 3 and 4 topics (same seed), compared by AIC.
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
# `TRUE` spelled out: `T` is an ordinary variable and can be reassigned.
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, base = TRUE)
## AIC dAIC df
## ap_lda2 8479.2 0.0 793
## ap_lda4 9086.4 607.2 1585
## ap_lda3 9271.1 791.9 1189
word-topic probabilities
# Per-topic word probabilities (beta) from the current 2-topic model.
ap_topics <- tidy(ap_lda2, matrix = "beta")

# Ten highest-beta terms of each topic (ties kept), sorted within topic.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Horizontal bars of the top terms, one panel per topic.
top_terms_ordered <- mutate(ap_top_terms, term = reorder(term, beta))
ggplot(top_terms_ordered, aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Document-topic probabilities
# Per-document topic probabilities (gamma) from the current 2-topic model.
ap_documents <- tidy(ap_lda2, matrix = "gamma")

# Assign each document to its highest-gamma topic. The gender is the last
# character of the document id ("<id>_<gender>").
classifi <- ap_documents %>%
  mutate(
    gender = substr(document, nchar(document), nchar(document))
  ) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

# Gender x assigned-topic contingency table.
table(classifi$gender, classifi$topic)
##
## 1 2
## F 13 11
## M 38 34
## gender 1 2
## F 54% 46%
## M 53% 47%
# Distribution of the winning gamma per assigned topic, one panel per gender
# (boxplot with a violin drawn over it).
ggplot(classifi, aes(as.character(topic), gamma)) +
  geom_boxplot() +
  geom_violin() +
  facet_wrap(~ gender)
ABSTRAC - semitniment analysis
Chapter 2, Silge & Robinson (2018).
- The NRC lexicon categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
## # A tibble: 13,875 Ă 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # âč 13,865 more rows
- The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
## # A tibble: 6,786 Ă 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # âč 6,776 more rows
- The AFINN lexicon assigns words a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
## # A tibble: 2,477 Ă 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # âč 2,467 more rows
- Another lexicon available in the package (Loughran):
## # A tibble: 4,150 Ă 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # âč 4,140 more rows
Difference in word scores between female and male abstracts
# AFINN lexicon: one numeric `value` per word.
affword <- get_sentiments("afinn")

# Word counts per talk and gender, restricted to words present in AFINN.
affc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(affword, "word")

# Per-talk scores: plain mean of the word values, and mean weighted by how
# often each word occurs. Computed once and reused by both plots.
aff_scores <- affc %>%
  group_by(id, gender) %>%
  summarise(
    mean.score = mean(value),
    weig.score = weighted.mean(value, n)
  )

ggplot(aff_scores, aes(x = gender, y = mean.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggtitle("Mean words score per abstract and gender")

ggplot(aff_scores, aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  ggtitle("Weighted mean words score per abstract and gender")
Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica
# NRC lexicon: word -> sentiment category.
nrcword <- get_sentiments("nrc")

# Number of lexicon words per talk, gender and sentiment category.
nrc_hits <- inner_join(
  count(text, id, gender, word, sort = TRUE),
  nrcword,
  "word"
)
nrc <- nrc_hits %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# Counts by gender, one panel per sentiment category.
ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin()
Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica
# Bing lexicon: word -> sentiment category.
bingword <- get_sentiments("bing")

# Number of lexicon words per talk, gender and sentiment category.
bing_hits <- inner_join(
  count(text, id, gender, word, sort = TRUE),
  bingword,
  "word"
)
bing <- bing_hits %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# Counts by sentiment category, one panel per gender.
ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin()
Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica
# Loughran lexicon: word -> sentiment category.
louword <- get_sentiments("loughran")

# Number of lexicon words per talk, gender and sentiment category.
lou <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(louword, "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

# Counts by sentiment category, one panel per gender. The Loughran counts
# (`lou`) are plotted here; the original drew `bing` again by mistake.
ggplot(lou, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin()